{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Following the results from 11/06, let's look a bit more into the calls common to GATK and triodenovo:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from scipy import stats\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: SLURM_JOB_ID=53272018\n",
      "env: SLURM_CPUS_PER_TASK=32\n"
     ]
    }
   ],
   "source": [
    "# need to make this less clunky in the future!\n",
    "%env SLURM_JOB_ID=53272018\n",
    "%env SLURM_CPUS_PER_TASK=32"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GATK high conf and triodenovo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "cd ~/data/tmp/\n",
    "suffix=hiConfANDtriodenovo\n",
    "\n",
    "echo 'famid,popmax_MAFltp01,affectedOnly' > rare_variants_affectedOnly_${suffix}.csv\n",
    "\n",
    "while read fam; do\n",
    "    awk '$2 < .01 {OFS=\":\"; print $3, $4}' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped > ${fam}_${suffix}_popmax_MAFltp01.txt;\n",
    "    if [ -e /data/NCR_SBRB/simplex/triodenovo/${fam}_trio2_denovo_v2.vcf ]; then\n",
    "        rm possible_snvs.txt unaffected_snvs.txt\n",
    "        # let's be conservative here and disregard the SNV if it was picked up\n",
    "        # by hiConf OR triodenovo unaffected\n",
    "        cat /data/NCR_SBRB/simplex/gatk_refine/${fam}_trio[2..4]_hiConfDeNovo.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        cat /data/NCR_SBRB/simplex/triodenovo/${fam}_trio[2..4]_denovo_v2.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        # for each possible SNV in affected trio, mark it as interesting if it's not\n",
    "        # in the unnafected trios\n",
    "        while read snv; do\n",
    "            if ! grep -q \"$snv\" unaffected_snvs.txt; then\n",
    "                echo $snv >> possible_snvs.txt;\n",
    "            fi;\n",
    "        done < ${fam}_${suffix}_popmax_MAFltp01.txt;\n",
    "        naffected=`cat possible_snvs.txt | wc -l`\n",
    "    else\n",
    "        naffected='NA'\n",
    "    fi;\n",
    "    nMAFpopmax=`awk '$2 < .01' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped | wc -l`\n",
    "    echo $fam,$nMAFpopmax,$naffected >> rare_variants_affectedOnly_${suffix}.csv;\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>famid</th>\n",
       "      <th>popmax_MAFltp01</th>\n",
       "      <th>affectedOnly</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10033</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10042</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10090</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10094</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10128</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10131</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10153</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10164</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10173</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10178</td>\n",
       "      <td>11</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10182</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10197</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10215</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>10369</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>10406</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>10448</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>1892</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>1893</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1895</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1976</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>855</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    famid  popmax_MAFltp01  affectedOnly\n",
       "0   10033                3             3\n",
       "1   10042                6           NaN\n",
       "2   10090                3             3\n",
       "3   10094                3             3\n",
       "4   10128                7             7\n",
       "5   10131                4             1\n",
       "6   10153                4             3\n",
       "7   10164                2             2\n",
       "8   10173                5             5\n",
       "9   10178               11            10\n",
       "10  10182                6             6\n",
       "11  10197                2             2\n",
       "12  10215                3             3\n",
       "13  10369                1             1\n",
       "14  10406               10            10\n",
       "15  10448                5             5\n",
       "16   1892                3             3\n",
       "17   1893                3             3\n",
       "18   1895                5             4\n",
       "19   1976                5             4\n",
       "20    855                6             6"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = pd.read_csv('/home/sudregp/data/tmp/rare_variants_affectedOnly_hiConfANDtriodenovo.csv')\n",
    "res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, we have a handful of candidates here... where are they in the genome?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "cd ~/data/tmp/\n",
    "suffix=hiConfANDtriodenovo\n",
    "\n",
    "while read fam; do\n",
    "    awk '$2 < .01 {OFS=\":\"; print $3, $4}' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped > ${fam}_${suffix}_popmax_MAFltp01.txt;\n",
    "    if [ -e /data/NCR_SBRB/simplex/triodenovo/${fam}_trio2_denovo_v2.vcf ]; then\n",
    "        rm unaffected_snvs.txt\n",
    "        # let's be conservative here and disregard the SNV if it was picked up\n",
    "        # by hiConf OR triodenovo unaffected\n",
    "        cat /data/NCR_SBRB/simplex/gatk_refine/${fam}_trio[2..4]_hiConfDeNovo.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        cat /data/NCR_SBRB/simplex/triodenovo/${fam}_trio[2..4]_denovo_v2.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        # for each possible SNV in affected trio, mark it as interesting if it's not\n",
    "        # in the unnafected trios\n",
    "        while read snv; do\n",
    "            if ! grep -q \"$snv\" unaffected_snvs.txt; then\n",
    "                echo $snv >> possible_snvs_${fam}_${suffix}.txt;\n",
    "            fi;\n",
    "        done < ${fam}_${suffix}_popmax_MAFltp01.txt;\n",
    "    else\n",
    "        cp ${fam}_${suffix}_popmax_MAFltp01.txt possible_snvs_${fam}_${suffix}.txt\n",
    "    fi;\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "\n",
    "module load annovar\n",
    "cd ~/data/tmp/\n",
    "suffix=hiConfANDtriodenovo\n",
    "\n",
    "echo 'famid,affectedOnly,exonic,intronic,splicing,intergenic,utr3,downstream' > rare_variants_affectedOnly_refGene_${suffix}.csv\n",
    "\n",
    "while read fam; do\n",
    "    rm ${fam}_trio1_possibleOnly_${suffix}.avinput;\n",
    "    for pos in `cut -f 2 -d \":\" possible_snvs_${fam}_${suffix}.txt`; do\n",
    "        grep $pos ${fam}_trio1_${suffix}.avinput >> ${fam}_trio1_possibleOnly_${suffix}.avinput;\n",
    "    done\n",
    "    annotate_variation.pl -buildver hg19 ${fam}_trio1_possibleOnly_${suffix}.avinput $ANNOVAR_DATA/hg19;\n",
    "    naffected=`cat possible_snvs_${fam}_${suffix}.txt | wc -l`;\n",
    "    nexon=`grep exonic ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nintron=`grep intronic ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nsplice=`grep splicing ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    ninter=`grep intergenic ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nutr=`grep UTR3 ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    ndown=`grep downstream ${fam}_trio1_possibleOnly_${suffix}.avinput.variant_function | wc -l`;\n",
    "    echo $fam,$naffected,$nexon,$nintron,$nsplice,$ninter,$nutr,$ndown >> rare_variants_affectedOnly_refGene_${suffix}.csv;\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GATK all and triodenovo overlap\n",
    "\n",
    "While we run the intersection between hiConf GATK and triodenovo, let's also check all denovo calls from GATK:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10033 96\n",
      "10042 76\n",
      "10090 80\n",
      "10094 69\n",
      "10128 71\n",
      "10131 57\n",
      "10153 73\n",
      "10164 92\n",
      "10173 68\n",
      "10178 142\n",
      "10182 67\n",
      "10197 103\n",
      "10215 88\n",
      "10369 49\n",
      "10406 68\n",
      "10448 75\n",
      "1892 51\n",
      "1893 89\n",
      "1895 77\n",
      "1976 128\n",
      "855 80\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "\n",
    "cd ~/data/tmp/\n",
    "\n",
    "# for each family ID\n",
    "while read fam; do\n",
    "  # get all GATK SNVs in the affected trio in the family\n",
    "  suffix=allDeNovo\n",
    "  cut -f 1,2 --output-delimiter=\":\" /data/NCR_SBRB/simplex/gatk_refine/${fam}_trio1_${suffix}.vcf | grep -v '#' - | sort | uniq > ${fam}_possible_snvs_${suffix}.txt;\n",
    "  suffix=denovo_v2\n",
    "  cut -f 1,2 --output-delimiter=\":\" /data/NCR_SBRB/simplex/triodenovo/${fam}_trio1_${suffix}.vcf | grep -v '#' - | sort | uniq > ${fam}_possible_snvs_${suffix}.txt;\n",
    "  grep -Fx -f ${fam}_possible_snvs_allDeNovo.txt ${fam}_possible_snvs_denovo_v2.txt > intersect.txt;\n",
    "  echo $fam `cat intersect.txt | wc -l`\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As expected, this is much higher than the numbers using hiConf only. Let's calculate the full numbers for them:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10033_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10033_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10033_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10033_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10033_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10033_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10042_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10042_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10042_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10042_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10042_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10042_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10090_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10090_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10090_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10090_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10090_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10090_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10094_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10094_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10094_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10094_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10094_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10094_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10128_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10128_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10128_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10128_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10128_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10128_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10131_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10131_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10131_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10131_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10131_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10131_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10153_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10153_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10153_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10153_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10153_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10153_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10164_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10164_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10164_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10164_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10164_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10164_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10173_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10173_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10173_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10173_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10173_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10173_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10178_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10178_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10178_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10178_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10178_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10178_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10182_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10182_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10182_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10182_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10182_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10182_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10197_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10197_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10197_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10197_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10197_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10197_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10215_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10215_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10215_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10215_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10215_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10215_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10369_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10369_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10369_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10369_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10369_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10369_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10406_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10406_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10406_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10406_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10406_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10406_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 10448_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10448_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 10448_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `10448_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 10448_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 10448_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 1892_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1892_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 1892_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1892_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 1892_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 1892_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 1893_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1893_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 1893_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1893_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 1893_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 1893_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 1895_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1895_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 1895_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1895_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 1895_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 1895_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 1976_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1976_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 1976_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `1976_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 1976_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 1976_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n",
      "bash: line 40: snp_pos: No such file or directory\n",
      "bash: line 26: convert2annovar.pl: command not found\n",
      "bash: line 28: annotate_variation.pl: command not found\n",
      "cat: 855_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `855_trio1_hiConfANDtriodenovo.avinput.hg19_popfreq_max_20150413_dropped' for reading (No such file or directory)\n",
      "bash: line 32: annotate_variation.pl: command not found\n",
      "cat: 855_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped: No such file or directory\n",
      "awk: cmd. line:1: fatal: cannot open file `855_trio1_hiConfANDtriodenovo.avinput.hg19_kaviar_20150923_dropped' for reading (No such file or directory)\n",
      "bash: line 35: annotate_variation.pl: command not found\n",
      "cat: 855_trio1_hiConfANDtriodenovo.avinput.hg19_avsnp142_dropped: No such file or directory\n",
      "bash: line 37: annotate_variation.pl: command not found\n",
      "cat: 855_trio1_hiConfANDtriodenovo.avinput.hg19_clinvar_20170130_dropped: No such file or directory\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "module load annovar\n",
    "cd ~/data/tmp/\n",
    "suffix=GATKallANDtriodenovo\n",
    "\n",
    "echo \"famid,denovo,popmax,popmax_MAFltp01,kaviar,kaviar_MAFltp01,dbSNP,ClinVar\" > rare_variants_${suffix}.csv;\n",
    "# for each family ID\n",
    "while read fam; do\n",
    "    # get all GATK SNVs in the affected trio in the family\n",
    "    cut -f 1,2 --output-delimiter=\":\" /data/NCR_SBRB/simplex/gatk_refine/${fam}_trio1_allDeNovo.vcf | grep -v '#' - | sort | uniq > ${fam}_possible_snvs_hiConfDeNovo.txt;\n",
    "    cut -f 1,2 --output-delimiter=\":\" /data/NCR_SBRB/simplex/triodenovo/${fam}_trio1_denovo_v2.vcf | grep -v '#' - | sort | uniq > ${fam}_possible_snvs_denovo_v2.txt;\n",
    "    grep -Fx -f ${fam}_possible_snvs_allDeNovo.txt ${fam}_possible_snvs_denovo_v2.txt > intersect.txt;\n",
    "\n",
    "    # the VCF from triodenovo is a bit cleaner, so let's grep from there\n",
    "    grep \"#\" /data/NCR_SBRB/simplex/triodenovo/${fam}_trio1_denovo_v2.vcf > interesting_snvs.vcf;\n",
    "    cut -f 2 -d \":\" intersect.txt > snp_pos.txt;\n",
    "    while read snv; do\n",
    "        grep ${snv} /data/NCR_SBRB/simplex/triodenovo/${fam}_trio1_denovo_v2.vcf >> interesting_snvs.vcf;\n",
    "    done < snp_pos.txt;\n",
    "    \n",
    "    # get all SNVs in the affected trio in the family\n",
    "    cut -f 1,2 interesting_snvs.vcf | grep -v '#' - | sort | uniq > ${fam}_possible_snvs_${suffix}.txt;\n",
    "    ndenovo=`cat ${fam}_possible_snvs_${suffix}.txt | wc -l`;\n",
    "    \n",
    "    # convert the file to ANNOVAR format\n",
    "    convert2annovar.pl -format vcf4old interesting_snvs.vcf > ${fam}_trio1_${suffix}.avinput;\n",
    "    # assign population statistics to the file accoring to different databases\n",
    "    annotate_variation.pl -filter -dbtype popfreq_max_20150413 ${fam}_trio1_${suffix}.avinput $ANNOVAR_DATA/hg19 -build hg19;\n",
    "    npopmax=`cat ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped | wc -l`;\n",
    "    # filter based on rare variants\n",
    "    nMAFpopmax=`awk '$2 < .01' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped | wc -l`\n",
    "    annotate_variation.pl -filter -dbtype kaviar_20150923 ${fam}_trio1_${suffix}.avinput $ANNOVAR_DATA/hg19 -build hg19;\n",
    "    nkaviar=`cat ${fam}_trio1_${suffix}.avinput.hg19_kaviar_20150923_dropped | wc -l`;\n",
    "    nMAFkaviar=`awk '$2 < .01' ${fam}_trio1_${suffix}.avinput.hg19_kaviar_20150923_dropped | wc -l`\n",
    "    annotate_variation.pl -filter -dbtype avsnp142 ${fam}_trio1_${suffix}.avinput $ANNOVAR_DATA/hg19 -build hg19;\n",
    "    ndbsnp=`cat ${fam}_trio1_${suffix}.avinput.hg19_avsnp142_dropped | wc -l`;\n",
    "    annotate_variation.pl -filter -dbtype clinvar_20170130 ${fam}_trio1_${suffix}.avinput $ANNOVAR_DATA/hg19 -build hg19;\n",
    "    nclinvar=`cat ${fam}_trio1_${suffix}.avinput.hg19_clinvar_20170130_dropped | wc -l`;\n",
    "    echo $fam,$ndenovo,$npopmax,$nMAFpopmax,$nkaviar,$nMAFkaviar,$ndbsnp,$nclinvar >> rare_variants_${suffix}.csv\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Why aren't some variant being picked up by annovar?\n",
    "\n",
    "For example, look at this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>famid</th>\n",
       "      <th>denovo</th>\n",
       "      <th>popmax</th>\n",
       "      <th>popmax_MAFltp01</th>\n",
       "      <th>kaviar</th>\n",
       "      <th>kaviar_MAFltp01</th>\n",
       "      <th>dbSNP</th>\n",
       "      <th>ClinVar</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10033</td>\n",
       "      <td>69</td>\n",
       "      <td>19</td>\n",
       "      <td>9</td>\n",
       "      <td>38</td>\n",
       "      <td>33</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10042</td>\n",
       "      <td>81</td>\n",
       "      <td>34</td>\n",
       "      <td>17</td>\n",
       "      <td>49</td>\n",
       "      <td>43</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10090</td>\n",
       "      <td>89</td>\n",
       "      <td>46</td>\n",
       "      <td>16</td>\n",
       "      <td>64</td>\n",
       "      <td>56</td>\n",
       "      <td>52</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10094</td>\n",
       "      <td>80</td>\n",
       "      <td>39</td>\n",
       "      <td>18</td>\n",
       "      <td>53</td>\n",
       "      <td>45</td>\n",
       "      <td>45</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10128</td>\n",
       "      <td>54</td>\n",
       "      <td>21</td>\n",
       "      <td>11</td>\n",
       "      <td>34</td>\n",
       "      <td>29</td>\n",
       "      <td>22</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10131</td>\n",
       "      <td>48</td>\n",
       "      <td>25</td>\n",
       "      <td>9</td>\n",
       "      <td>30</td>\n",
       "      <td>26</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10153</td>\n",
       "      <td>70</td>\n",
       "      <td>23</td>\n",
       "      <td>10</td>\n",
       "      <td>36</td>\n",
       "      <td>27</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10164</td>\n",
       "      <td>122</td>\n",
       "      <td>61</td>\n",
       "      <td>18</td>\n",
       "      <td>80</td>\n",
       "      <td>70</td>\n",
       "      <td>68</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10173</td>\n",
       "      <td>101</td>\n",
       "      <td>57</td>\n",
       "      <td>22</td>\n",
       "      <td>73</td>\n",
       "      <td>59</td>\n",
       "      <td>61</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10178</td>\n",
       "      <td>519</td>\n",
       "      <td>413</td>\n",
       "      <td>59</td>\n",
       "      <td>445</td>\n",
       "      <td>361</td>\n",
       "      <td>414</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10182</td>\n",
       "      <td>90</td>\n",
       "      <td>53</td>\n",
       "      <td>12</td>\n",
       "      <td>64</td>\n",
       "      <td>49</td>\n",
       "      <td>56</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10197</td>\n",
       "      <td>416</td>\n",
       "      <td>339</td>\n",
       "      <td>59</td>\n",
       "      <td>370</td>\n",
       "      <td>310</td>\n",
       "      <td>339</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10215</td>\n",
       "      <td>201</td>\n",
       "      <td>138</td>\n",
       "      <td>27</td>\n",
       "      <td>162</td>\n",
       "      <td>141</td>\n",
       "      <td>152</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>10369</td>\n",
       "      <td>948</td>\n",
       "      <td>470</td>\n",
       "      <td>247</td>\n",
       "      <td>618</td>\n",
       "      <td>530</td>\n",
       "      <td>519</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>10406</td>\n",
       "      <td>119</td>\n",
       "      <td>61</td>\n",
       "      <td>31</td>\n",
       "      <td>84</td>\n",
       "      <td>73</td>\n",
       "      <td>59</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>10448</td>\n",
       "      <td>69</td>\n",
       "      <td>29</td>\n",
       "      <td>14</td>\n",
       "      <td>41</td>\n",
       "      <td>32</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>1892</td>\n",
       "      <td>91</td>\n",
       "      <td>51</td>\n",
       "      <td>23</td>\n",
       "      <td>68</td>\n",
       "      <td>55</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>1893</td>\n",
       "      <td>95</td>\n",
       "      <td>46</td>\n",
       "      <td>19</td>\n",
       "      <td>62</td>\n",
       "      <td>54</td>\n",
       "      <td>52</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1895</td>\n",
       "      <td>82</td>\n",
       "      <td>36</td>\n",
       "      <td>15</td>\n",
       "      <td>48</td>\n",
       "      <td>37</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1976</td>\n",
       "      <td>91</td>\n",
       "      <td>33</td>\n",
       "      <td>13</td>\n",
       "      <td>56</td>\n",
       "      <td>50</td>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>855</td>\n",
       "      <td>79</td>\n",
       "      <td>28</td>\n",
       "      <td>13</td>\n",
       "      <td>42</td>\n",
       "      <td>33</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    famid  denovo  popmax  popmax_MAFltp01  kaviar  kaviar_MAFltp01  dbSNP  \\\n",
       "0   10033      69      19                9      38               33     21   \n",
       "1   10042      81      34               17      49               43     37   \n",
       "2   10090      89      46               16      64               56     52   \n",
       "3   10094      80      39               18      53               45     45   \n",
       "4   10128      54      21               11      34               29     22   \n",
       "5   10131      48      25                9      30               26     25   \n",
       "6   10153      70      23               10      36               27     25   \n",
       "7   10164     122      61               18      80               70     68   \n",
       "8   10173     101      57               22      73               59     61   \n",
       "9   10178     519     413               59     445              361    414   \n",
       "10  10182      90      53               12      64               49     56   \n",
       "11  10197     416     339               59     370              310    339   \n",
       "12  10215     201     138               27     162              141    152   \n",
       "13  10369     948     470              247     618              530    519   \n",
       "14  10406     119      61               31      84               73     59   \n",
       "15  10448      69      29               14      41               32     31   \n",
       "16   1892      91      51               23      68               55     57   \n",
       "17   1893      95      46               19      62               54     52   \n",
       "18   1895      82      36               15      48               37     35   \n",
       "19   1976      91      33               13      56               50     36   \n",
       "20    855      79      28               13      42               33     30   \n",
       "\n",
       "    ClinVar  \n",
       "0         0  \n",
       "1         0  \n",
       "2         0  \n",
       "3         1  \n",
       "4         0  \n",
       "5         0  \n",
       "6         0  \n",
       "7         0  \n",
       "8         0  \n",
       "9         0  \n",
       "10        0  \n",
       "11        0  \n",
       "12        1  \n",
       "13       11  \n",
       "14        0  \n",
       "15        0  \n",
       "16        0  \n",
       "17        0  \n",
       "18        0  \n",
       "19        0  \n",
       "20        0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = pd.read_csv('/data/NCR_SBRB/simplex/gatk_refine/rare_variants_hiConfDeNovo.csv')\n",
    "res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, over half of the variants are not being picked up by popmax. Fewer by Kaviar, but still. Is there anything in common between those variants? I'll also try running snpEff in a separate notebook to see if it gives me more info.\n",
    "\n",
    "snpEff had the same issue that some variants were not annotated... asked a question in Biostars about it."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using all variants\n",
    "\n",
    "I got a couple answers in Biostars saying that I should certainly consider those variants that didn't come up in the databases. So, let's try it, again with the intersection of triodenovo and hiConf:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "module load annovar\n",
    "cd ~/data/tmp/\n",
    "suffix=hiConfANDtriodenovo\n",
    "\n",
    "echo 'famid,popmaxMAFltp01ANDNotFound,affectedOnly,exonic,intronic,splicing,intergenic,UTR?,[up/down]stream' > rare_variantsWithNotFound_affectedOnly_${suffix}.csv\n",
    "\n",
    "while read fam; do\n",
    "    awk '$2 < .01 {OFS=\":\"; print $3, $4}' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_dropped > ${fam}_${suffix}_popmaxMAFltp01ANDNotFound.txt;\n",
    "    awk '{OFS=\":\"; print $1, $2}' ${fam}_trio1_${suffix}.avinput.hg19_popfreq_max_20150413_filtered >> ${fam}_${suffix}_popmaxMAFltp01ANDNotFound.txt;\n",
    "    if [ -e /data/NCR_SBRB/simplex/triodenovo/${fam}_trio2_denovo_v2.vcf ]; then\n",
    "        rm possibleAndNotFound_snvs_${fam}_${suffix}.txt unaffected_snvs.txt\n",
    "        # let's be conservative here and disregard the SNV if it was picked up\n",
    "        # by hiConf OR triodenovo unaffected\n",
    "        cat /data/NCR_SBRB/simplex/gatk_refine/${fam}_trio[2..4]_hiConfDeNovo.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        cat /data/NCR_SBRB/simplex/triodenovo/${fam}_trio[2..4]_denovo_v2.vcf | grep -v '#' - | awk 'BEGIN {FS=\"\\t\"; OFS=\":\"}; {print $1, $2}' - | sort | uniq >> unaffected_snvs.txt;\n",
    "        # for each possible SNV in affected trio, mark it as interesting if it's not\n",
    "        # in the unnafected trios\n",
    "        while read snv; do\n",
    "            if ! grep -q \"$snv\" unaffected_snvs.txt; then\n",
    "                echo $snv >> possibleAndNotFound_snvs_${fam}_${suffix}.txt;\n",
    "            fi;\n",
    "        done < ${fam}_${suffix}_popmaxMAFltp01ANDNotFound.txt;\n",
    "        naffected=`cat possibleAndNotFound_snvs_${fam}_${suffix}.txt | wc -l`\n",
    "    else\n",
    "        naffected='NA'\n",
    "        cp ${fam}_${suffix}_popmaxMAFltp01ANDNotFound.txt possibleAndNotFound_snvs_${fam}_${suffix}.txt\n",
    "    fi;\n",
    "    nMAFpopmax=`cat ${fam}_${suffix}_popmaxMAFltp01ANDNotFound.txt | wc -l`\n",
    "    \n",
    "    # figuring where the variants fall in the DNA\n",
    "    rm ${fam}_trio1_possibleAndNotFound_${suffix}.avinput;\n",
    "    for pos in `cut -f 2 -d \":\" possibleAndNotFound_snvs_${fam}_${suffix}.txt`; do\n",
    "        grep $pos ${fam}_trio1_${suffix}.avinput >> ${fam}_trio1_possibleAndNotFound_${suffix}.avinput;\n",
    "    done\n",
    "    annotate_variation.pl -buildver hg19 ${fam}_trio1_possibleAndNotFound_${suffix}.avinput $ANNOVAR_DATA/hg19;\n",
    "    nexon=`grep exonic ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nintron=`grep intronic ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nsplice=`grep splicing ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    ninter=`grep intergenic ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    nutr=`grep UTR ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    ndown=`grep stream ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | wc -l`;\n",
    "    echo $fam,$nMAFpopmax,$naffected,$nexon,$nintron,$nsplice,$ninter,$nutr,$ndown >> rare_variantsWithNotFound_affectedOnly_${suffix}.csv;\n",
    "done < /data/NCR_SBRB/simplex/famids.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>famid</th>\n",
       "      <th>popmaxMAFltp01ANDNotFound</th>\n",
       "      <th>affectedOnly</th>\n",
       "      <th>exonic</th>\n",
       "      <th>intronic</th>\n",
       "      <th>splicing</th>\n",
       "      <th>intergenic</th>\n",
       "      <th>UTR?</th>\n",
       "      <th>[up/down]stream</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10033</td>\n",
       "      <td>36</td>\n",
       "      <td>35</td>\n",
       "      <td>3</td>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10042</td>\n",
       "      <td>30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10090</td>\n",
       "      <td>31</td>\n",
       "      <td>29</td>\n",
       "      <td>7</td>\n",
       "      <td>14</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10094</td>\n",
       "      <td>24</td>\n",
       "      <td>23</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10128</td>\n",
       "      <td>28</td>\n",
       "      <td>26</td>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10131</td>\n",
       "      <td>21</td>\n",
       "      <td>17</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10153</td>\n",
       "      <td>32</td>\n",
       "      <td>30</td>\n",
       "      <td>6</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10164</td>\n",
       "      <td>31</td>\n",
       "      <td>30</td>\n",
       "      <td>10</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10173</td>\n",
       "      <td>22</td>\n",
       "      <td>20</td>\n",
       "      <td>5</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10178</td>\n",
       "      <td>49</td>\n",
       "      <td>47</td>\n",
       "      <td>15</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10182</td>\n",
       "      <td>23</td>\n",
       "      <td>21</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10197</td>\n",
       "      <td>26</td>\n",
       "      <td>24</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10215</td>\n",
       "      <td>32</td>\n",
       "      <td>32</td>\n",
       "      <td>11</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>10369</td>\n",
       "      <td>19</td>\n",
       "      <td>17</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>10406</td>\n",
       "      <td>26</td>\n",
       "      <td>26</td>\n",
       "      <td>10</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>10448</td>\n",
       "      <td>28</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>1892</td>\n",
       "      <td>24</td>\n",
       "      <td>24</td>\n",
       "      <td>6</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>1893</td>\n",
       "      <td>37</td>\n",
       "      <td>37</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1895</td>\n",
       "      <td>34</td>\n",
       "      <td>32</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1976</td>\n",
       "      <td>42</td>\n",
       "      <td>41</td>\n",
       "      <td>9</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>855</td>\n",
       "      <td>37</td>\n",
       "      <td>35</td>\n",
       "      <td>11</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    famid  popmaxMAFltp01ANDNotFound  affectedOnly  exonic  intronic  \\\n",
       "0   10033                         36            35       3        14   \n",
       "1   10042                         30           NaN       8        12   \n",
       "2   10090                         31            29       7        14   \n",
       "3   10094                         24            23       5         6   \n",
       "4   10128                         28            26       8         7   \n",
       "5   10131                         21            17       3         4   \n",
       "6   10153                         32            30       6        10   \n",
       "7   10164                         31            30      10        13   \n",
       "8   10173                         22            20       5         9   \n",
       "9   10178                         49            47      15        17   \n",
       "10  10182                         23            21       8         4   \n",
       "11  10197                         26            24       1         6   \n",
       "12  10215                         32            32      11         7   \n",
       "13  10369                         19            17       5         1   \n",
       "14  10406                         26            26      10         8   \n",
       "15  10448                         28            28       4         9   \n",
       "16   1892                         24            24       6        12   \n",
       "17   1893                         37            37       3        20   \n",
       "18   1895                         34            32       9        12   \n",
       "19   1976                         42            41       9        19   \n",
       "20    855                         37            35      11         8   \n",
       "\n",
       "    splicing  intergenic  UTR?  [up/down]stream  \n",
       "0          1          10     6                1  \n",
       "1          0           4     6                0  \n",
       "2          0           3     5                1  \n",
       "3          1           4     7                0  \n",
       "4          0           5     5                1  \n",
       "5          0           3     6                1  \n",
       "6          0           8     4                2  \n",
       "7          0           4     2                1  \n",
       "8          0           1     4                1  \n",
       "9          0           5     8                2  \n",
       "10         0           8     0                1  \n",
       "11         0          11     6                0  \n",
       "12         0           8     4                2  \n",
       "13         0           9     1                1  \n",
       "14         0           4     4                0  \n",
       "15         2           8     5                0  \n",
       "16         0           6     0                0  \n",
       "17         0           5     8                1  \n",
       "18         0           6     4                1  \n",
       "19         0           7     4                2  \n",
       "20         0          10     4                2  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = pd.read_csv('/home/sudregp/data/tmp/rare_variantsWithNotFound_affectedOnly_hiConfANDtriodenovo.csv')\n",
    "res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For completeness, let's spit out the genes in the exomic regions as well:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ":LILRB5:NDUFS7\n",
      ":ANKRD36C:DCAF15:MST1P2:MUC16:RBMXL1\n",
      ":BRDT:FCGR2A:FRG1BP,FRG1DP:MOG:NBPF8,NBPF9:SLC15A1:TEKT4\n",
      ":CREB3L1:MFF:MST1L:MUC16:TMPPE\n",
      ":LOC644762:POTEG:SKA3:ZNF326:ZNF781\n",
      ":ANKRD36C:FRG1BP:MST1P2\n",
      ":FAM47C:FRG1JP:GPR75:SPATA31D3\n",
      ":ANKRD36C:GP1BA:LILRA6,LILRB3:MUC12:OR11H2:OR5K2:PRAMEF10,PRAMEF33:PRAMEF11\n",
      ":FAM182B:MUC7:SLC25A5:TPSAB1:WASH3P\n",
      ":CSPG4:FRG1BP,FRG1DP:GABBR2:LOC440570:MFF:MST1P2:MUC6:NBPF1:NDUFV1:NOC2LP2:PNMA1:VN1R4\n",
      ":ADH5:ANKRD20A12P:C16orf52:CHD2:FAM86FP:FAM90A25P:TBC1D9\n",
      ":AGRN\n",
      ":AMOTL1:ANKRD20A8P:AQP7:ATP8B2:FAM182A:GOLGA8EP:LRRC37A4P:MST1P2:OR4A16:TEKT4P2:TNC\n",
      ":MUC6:NUTM2B:OR8G1\n",
      ":ELFN1:PMS2CL:UNC93A:XPOT:ZNF66\n",
      ":SEMG1:STIM1:ZNF890P\n",
      ":COL4A6:LOC441666:NUTM2B:OR6N2:TYRO3\n",
      ":ADAM20:CHD4:GGT1\n",
      ":HEATR1:MST1P2:MUC22:MUC5B:MUC6:PSMD1:TEKT4P2\n",
      ":ANKRD19P:CRACR2A:FRG1BP,FRG1DP:LOC102725072:MLLT10P1:MUC12:PCDHA8:UNC5A\n",
      ":COL4A4:MLLT10P1:MUC22:TAS2R30:ZNF717:ZNF98\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "\n",
    "cd ~/data/tmp/\n",
    "suffix=hiConfANDtriodenovo\n",
    "fam=10033\n",
    "\n",
    "while read fam; do\n",
    "    genes=''\n",
    "    for gene in `grep exonic ${fam}_trio1_possibleAndNotFound_${suffix}.avinput.variant_function | cut -f 2 | sort | uniq`; do\n",
    "        genes=${genes}:${gene}\n",
    "    done\n",
    "    echo $genes\n",
    "done < /data/NCR_SBRB/simplex/famids.txt\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py2.7.10]",
   "language": "python",
   "name": "conda-env-py2.7.10-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  },
  "nav_menu": {},
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
